{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "slideshow": {
     "slide_type": "slide"
    }
   },
   "source": [
    "# Character based RNN language model\n",
    "(c) Deniz Yuret, 2019. Based on http://karpathy.github.io/2015/05/21/rnn-effectiveness."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "* Objectives: Learn to define and train a character based language model and generate text from it. Minibatch blocks of text. Keep a persistent RNN state between updates. Train a Shakespeare generator and a Julia programmer using the same type of model.\n",
    "* Prerequisites: [RNN basics](60.rnn.ipynb), [Iterators](25.iterators.ipynb)\n",
    "* New functions:\n",
    "[converge](http://denizyuret.github.io/Knet.jl/latest/reference/#Knet.converge)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Set display width, load packages, import symbols\n",
    "ENV[\"COLUMNS\"]=72\n",
    "using Pkg; haskey(Pkg.installed(),\"Knet\") || Pkg.add(\"Knet\")\n",
    "using Statistics: mean\n",
    "using Base.Iterators: cycle\n",
    "using Knet: Knet, AutoGrad, Data, param, param0, mat, RNN, dropout, value, nll, adam, minibatch, progress!, converge"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Define the model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "struct Embed; w; end\n",
    "\n",
    "Embed(vocab::Int,embed::Int)=Embed(param(embed,vocab))\n",
    "\n",
    "(e::Embed)(x) = e.w[:,x]  # (B,T)->(X,B,T)->rnn->(H,B,T)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "struct Linear; w; b; end\n",
    "\n",
    "Linear(input::Int, output::Int)=Linear(param(output,input), param0(output))\n",
    "\n",
    "(l::Linear)(x) = l.w * mat(x,dims=1) .+ l.b  # (H,B,T)->(H,B*T)->(V,B*T)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Let's define a chain of layers\n",
    "struct Chain\n",
    "    layers\n",
    "    Chain(layers...) = new(layers)\n",
    "end\n",
    "(c::Chain)(x) = (for l in c.layers; x = l(x); end; x)\n",
    "(c::Chain)(x,y) = nll(c(x),y)\n",
    "(c::Chain)(d::Data) = mean(c(x,y) for (x,y) in d)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "CharLM (generic function with 1 method)"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# The h=0,c=0 options to RNN enable a persistent state between iterations\n",
    "CharLM(vocab::Int,embed::Int,hidden::Int; o...) = \n",
    "    Chain(Embed(vocab,embed), RNN(embed,hidden;h=0,c=0,o...), Linear(hidden,vocab))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Train and test utilities"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# For running experiments\n",
    "function trainresults(file,maker,chars)\n",
    "    if (print(\"Train from scratch? \"); readline()[1]=='y')\n",
    "        model = maker()\n",
    "        a = adam(model,cycle(dtrn))\n",
    "        b = (exp(model(dtst)) for _ in every(100,a))\n",
    "        c = converge(b, alpha=0.1)\n",
    "        progress!(c, alpha=1)\n",
    "        Knet.save(file,\"model\",model,\"chars\",chars)\n",
    "    else\n",
    "        isfile(file) || download(\"http://people.csail.mit.edu/deniz/models/tutorial/$file\",file)\n",
    "        model,chars = Knet.load(file,\"model\",\"chars\")\n",
    "    end\n",
    "    Knet.gc() # To save gpu memory\n",
    "    return model,chars\n",
    "end\n",
    "\n",
    "every(n,itr) = (x for (i,x) in enumerate(itr) if i%n == 0);"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "slideshow": {
     "slide_type": "slide"
    }
   },
   "outputs": [],
   "source": [
    "# To generate text from trained models\n",
    "function generate(model,chars,n)\n",
    "    function sample(y)\n",
    "        p = Array(exp.(y)); r = rand()*sum(p)\n",
    "        for j=1:length(p); (r -= p[j]) < 0 && return j; end\n",
    "    end\n",
    "    x = 1\n",
    "    reset!(model)\n",
    "    for i=1:n\n",
    "        y = model([x])\n",
    "        x = sample(y)\n",
    "        print(chars[x])\n",
    "    end\n",
    "    println()\n",
    "end\n",
    "\n",
    "reset!(m::Chain)=(for r in m.layers; r isa RNN && (r.c=r.h=0); end);"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## The Complete Works of William Shakespeare"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "slideshow": {
     "slide_type": "slide"
    }
   },
   "outputs": [],
   "source": [
    "RNNTYPE = :lstm\n",
    "BATCHSIZE = 256\n",
    "SEQLENGTH = 100\n",
    "VOCABSIZE = 84\n",
    "INPUTSIZE = 168\n",
    "HIDDENSIZE = 334\n",
    "NUMLAYERS = 1;"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "slideshow": {
     "slide_type": "slide"
    }
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "┌ Info: Downloading Gutenberg shaks12\n",
      "└ @ Main /home/jrun/.julia/packages/Knet/T1oum/data/gutenberg.jl:18\n",
      "  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current\n",
      "                                 Dload  Upload   Total   Spent    Left  Speed\n",
      "100 5451k  100 5451k    0     0  11.8M      0 --:--:-- --:--:-- --:--:-- 11.8M\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "(\"4934845-element Array{UInt8,1}\", \"526731-element Array{UInt8,1}\", \"84-element Array{Char,1}\")"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Load 'The Complete Works of William Shakespeare'\n",
    "include(Knet.dir(\"data\",\"gutenberg.jl\"))\n",
    "trn,tst,shakechars = shakespeare()\n",
    "map(summary,(trn,tst,shakechars))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "slideshow": {
     "slide_type": "fragment"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\r\n",
      "    Cheated of feature by dissembling nature,\r\n",
      "    Deform'd, unfinish'd, sent before my time\r\n",
      "    Into this breathing world scarce half made up,\r\n",
      "    And that so lamely and unfashionable\r\n",
      " \n"
     ]
    }
   ],
   "source": [
    "# Print a sample\n",
    "println(string(shakechars[trn[1020:1210]]...))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "slideshow": {
     "slide_type": "slide"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(192, 20)"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Minibatch data\n",
    "function mb(a)\n",
    "    N = length(a) ÷ BATCHSIZE\n",
    "    x = reshape(a[1:N*BATCHSIZE],N,BATCHSIZE)' # reshape full data to (B,N) with contiguous rows\n",
    "    minibatch(x[:,1:N-1], x[:,2:N], SEQLENGTH) # split into (B,T) blocks \n",
    "end\n",
    "dtrn,dtst = mb.((trn,tst))\n",
    "length.((dtrn,dtst))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(\"256×100 Array{UInt8,2}\", \"256×100 Array{UInt8,2}\")"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "summary.(first(dtrn))  # each x and y have dimensions (BATCHSIZE,SEQLENGTH)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Train from scratch? stdin> n\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current\n",
      "                                 Dload  Upload   Total   Spent    Left  Speed\n",
      "100 25.2M  100 25.2M    0     0  7616k      0  0:00:03  0:00:03 --:--:-- 7617k 0:00:05 --:--:--  0:00:05 4758k\n"
     ]
    }
   ],
   "source": [
    "# 3.30e+00  ┣   /       /       /       /       /    ┫ 122 [04:46, 2.35s/i]\n",
    "Knet.gc()\n",
    "shakemaker() = CharLM(VOCABSIZE, INPUTSIZE, HIDDENSIZE; rnnType=RNNTYPE, numLayers=NUMLAYERS)\n",
    "shakemodel,shakechars = trainresults(\"shakespeare113.jld2\", shakemaker, shakechars);"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "#exp(shakemodel(dtst))  # Perplexity = 3.30"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "slideshow": {
     "slide_type": "slide"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "BENTMWIS, DUMAIN,\n",
      "Rogs, with AWVIRS OF STINCE, COMINIUS,\n",
      "                       EROS, and from country Rome\n",
      "  KING HANGHRYO, a glorious colours, widow above.\n",
      "\n",
      "  Hot. Why, the claim of the law is in little fault.\n",
      "     Leop to Pohtory would be them present.\n",
      "    If she enter laugh in an angry valiant comfort.\n",
      "    How purpose me not so to the tale feed\n",
      "    A man, or, General, as we can lee.  \n",
      "    Say, gentle my grief, if the best king of them\n",
      "    The same-long for my vouchas view to you.\n",
      "  Ham. I'll on Huberth; half quickets' impossible viles it is.\n",
      "  OTHELLO. We think they will show me he id, I speak,  \n",
      "    I would hide you, good spearer. Meantimes his\n",
      "    Syrand to my consoverehood and ranowed,\n",
      "    Not a different. Come, can by temper it.\n",
      "    Look down are young Humphrey and Suffolk,\n",
      "    There have felast at full enmioder;\n",
      "    Begin to England.\n",
      "  Gon. How does the name that little extrebutians sworn?\n",
      "    I see th' earth so discourse Monnum's men\n",
      "    In me, the paper\n"
     ]
    }
   ],
   "source": [
    "generate(shakemodel,shakechars,1000)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Julia programmer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "RNNTYPE = :lstm\n",
    "BATCHSIZE = 64\n",
    "SEQLENGTH = 64\n",
    "INPUTSIZE = 512\n",
    "VOCABSIZE = 128\n",
    "HIDDENSIZE = 512\n",
    "NUMLAYERS = 2;"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "10796286"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Read julia base library source code\n",
    "base = joinpath(Sys.BINDIR, Base.DATAROOTDIR, \"julia\")\n",
    "text = \"\"\n",
    "for (root,dirs,files) in walkdir(base)\n",
    "    for f in files\n",
    "        f[end-2:end] == \".jl\" || continue\n",
    "        text *= read(joinpath(root,f), String)\n",
    "    end\n",
    "    # println((root,length(files),all(f->contains(f,\".jl\"),files)))\n",
    "end\n",
    "length(text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "3664×2 Array{Any,2}:\n",
       " ' '   2332771\n",
       " 'e'    647469\n",
       " 't'    557298\n",
       " 'n'    401708\n",
       " 'r'    400046\n",
       " 'i'    388728\n",
       " 's'    383550\n",
       " 'a'    373814\n",
       " 'o'    328822\n",
       " '\\n'   313058\n",
       " 'l'    237717\n",
       " ','    233825\n",
       " ')'    218411\n",
       " ⋮            \n",
       " 'ה'         1\n",
       " '🍢'         1\n",
       " '𝗾'         1\n",
       " '𝔔'         1\n",
       " 'É'         1\n",
       " '𝓟'         1\n",
       " '𝚿'         1\n",
       " '𝕨'         1\n",
       " 'ɛ'         1\n",
       " 'Χ'         1\n",
       " '🕙'         1\n",
       " 'ℚ'         1"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Find unique chars, sort by frequency, assign integer ids.\n",
    "charcnt = Dict{Char,Int}()\n",
    "for c in text; charcnt[c]=1+get(charcnt,c,0); end\n",
    "juliachars = sort(collect(keys(charcnt)), by=(x->charcnt[x]), rev=true)\n",
    "charid = Dict{Char,Int}()\n",
    "for i=1:length(juliachars); charid[juliachars[i]]=i; end\n",
    "hcat(juliachars, map(c->charcnt[c],juliachars))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(10796286, 10271998, 524288)"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Keep only VOCABSIZE most frequent chars, split into train and test\n",
    "data = map(c->charid[c], collect(text))\n",
    "data[data .> VOCABSIZE] .= VOCABSIZE\n",
    "ntst = 1<<19\n",
    "tst = data[1:ntst]\n",
    "trn = data[1+ntst:end]\n",
    "length.((data,trn,tst))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "izeof_constvec, ())[1]), Core.sizeof, 2)\n",
      "push!(constvec, 10)\n",
      "@test @inferred(sizeof_constvec()) == sizeof(Int) * 4\n",
      "\n",
      "test_const_return((x)->isdefined(x, :re), Tuple{ComplexF64}, true)\n",
      "isdefined_f3(x) = isdefined(x, 3)\n",
      "@test @inferred(isdefined_f3(())) == false\n",
      "@test find_call(first(code_typed(isdefined_f3, Tuple{Tuple{Vararg{Int}}})[1]), isdefined, 3)\n",
      "\n",
      "let isa_tfunc = Core.Compiler.T_FFUNC_VAL[\n",
      "        findfirst(x->x===isa, Core.Compiler.T_FFUNC_KEY)][3]\n",
      "    @test isa_tfunc(Array, Const(AbstractArray)) === Const(true)\n",
      "    @test isa_tfunc(Array, Type{AbstractArray}) === Const(true)\n",
      "    @test isa_tfunc(Array, Type{AbstractArray{Int}}) == Bool\n",
      "    @test isa_tfunc(Array{Real}, Type{AbstractArray{Int}}) === Const(false)\n",
      "    @test isa_tfunc(Array{Real, 2}, Const(AbstractArray{Real, 2})) === Const(true)\n",
      "    @test isa_tfunc(Array{Real, 2}, Const(AbstractArray{Int, 2})) === Const(false)\n",
      "    @test isa_tfunc(DataType, Int) === Union{}\n",
      "    @test isa_tfunc(DataType, Const(Type{Int})) === Bool\n",
      "    @te\n"
     ]
    }
   ],
   "source": [
    "# Print a sample\n",
    "r = rand(1:(length(trn)-1000))\n",
    "println(string(juliachars[trn[r:r+1000]]...)) "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(2507, 127)"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Minibatch data\n",
    "function mb(a)\n",
    "    N = length(a) ÷ BATCHSIZE\n",
    "    x = reshape(a[1:N*BATCHSIZE],N,BATCHSIZE)' # reshape full data to (B,N) with contiguous rows\n",
    "    minibatch(x[:,1:N-1], x[:,2:N], SEQLENGTH) # split into (B,T) blocks \n",
    "end\n",
    "dtrn,dtst = mb.((trn,tst))\n",
    "length.((dtrn,dtst))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(\"64×64 Array{Int64,2}\", \"64×64 Array{Int64,2}\")"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "summary.(first(dtrn))  # each x and y have dimensions (BATCHSIZE,SEQLENGTH)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Train from scratch? stdin> n\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current\n",
      "                                 Dload  Upload   Total   Spent    Left  Speed\n",
      "100 58.1M  100 58.1M    0     0  7830k      0  0:00:07  0:00:07 --:--:-- 8427k\n"
     ]
    }
   ],
   "source": [
    "# 3.25e+00  ┣       /       /       /       /       /┫ 126 [05:43, 2.72s/i]\n",
    "juliamaker() = CharLM(VOCABSIZE, INPUTSIZE, HIDDENSIZE; rnnType=RNNTYPE, numLayers=NUMLAYERS)\n",
    "juliamodel,juliachars = trainresults(\"juliacharlm113.jld2\", juliamaker, juliachars);"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "#exp(juliamodel(dtst))  # Perplexity = 3.27"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "     # 1.0, 11,10,591,118,118,11720,\n",
      "                    162,158,199,199,178]\n",
      "        @test precision(infinite) <: UnitRange{Int}\n",
      "        @test !(Type{UnitRange{Int}, Int8} NaN)\n",
      "        @test typeof(copy(UnitRange(-1))) == Const(true)\n",
      "        @test preserve(T, s) == subidity ? [preserved] for p in nn - 7-2:5; q = typemax(UInt)\n",
      "    ends = 0\n",
      "    # @test isequal(submask) == 0\n",
      "    @test isequal(ntoh(u)+>2, count => talename == difff.setfalloc_format = 1; @test_throws DimensionMismatch rdivg(none_enulir)\n",
      "    @test (Pair{TimeVal{Int}}(1)) == 7\n",
      "    @test a == 1\n",
      "    @test typeof(eigvals(Const(0))) === Union{Distributed.Worker, RebuitOptionSpecs},\n",
      "        Random.seed!(1234); wait(res)\n",
      "        @test invalid_key_args(new(Vector{Any}(zeros(24):Dims{2})) ==\n",
      "            vec_array(Any[false, false,1], Vararg{Any, NaN})\n",
      "        @test err == a1 || ret_tR2_nt = readdlm(IOBuffer(\"1\",1))\n",
      "        @test_throws ArgumentError Base.unwrap_unionall(Vector{Int}())\n",
      "    end\n",
      "\n",
      "    @test nothing            # operatio\n"
     ]
    }
   ],
   "source": [
    "generate(juliamodel,juliachars,1000)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "accelerator": "GPU",
  "colab": {
   "collapsed_sections": [],
   "name": "julia.ipynb",
   "provenance": [],
   "version": "0.3.2"
  },
  "kernelspec": {
   "display_name": "Julia 1.0.3",
   "language": "julia",
   "name": "julia-1.0"
  },
  "language_info": {
   "file_extension": ".jl",
   "mimetype": "application/julia",
   "name": "julia",
   "version": "1.0.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
